library(readxl)
data0 <- read_excel("E:\\data\\dunnhumby_Breakfast-at-the-Frat\\dunnhumby _Breakfast-at-the-Frat\\dunnhumby - Breakfast at the Frat.xlsx", sheet = 4)
data1 <- na.omit(data0)
product <- read_excel("E:\\data\\dunnhumby_Breakfast-at-the-Frat\\dunnhumby _Breakfast-at-the-Frat\\dunnhumby - Breakfast at the Frat.xlsx", sheet = 3)
trans_product <- merge(data1, product, by = "UPC")  # 合并transcation和product表
table(product$SUB_CATEGORY)
## 
##                ADULT CEREAL           ALL FAMILY CEREAL 
##                           3                           7 
##                 KIDS CEREAL MOUTHWASH/RINSES AND SPRAYS 
##                           5                           5 
##    MOUTHWASHES (ANTISEPTIC)               PIZZA/PREMIUM 
##                           8                          15 
##                    PRETZELS 
##                          15
## 对每一个sub_c&每一天的所有产品spend值求和
library(plyr)
library(magrittr)
library(xts)
library(dygraphs)
mydata <- ddply(trans_product, .(WEEK_END_DATE,SUB_CATEGORY), function(x) {c(sum(x$SPEND), mean(x$PRICE), sum(x$FEATURE), sum(x$DISPLAY), sum(x$TPR_ONLY))})
## 取出每一个sub_c
sub1 <- mydata[mydata$SUB_CATEGORY == "ADULT CEREAL",]
sub2 <- mydata[mydata$SUB_CATEGORY == "ALL FAMILY CEREAL",]
sub3 <- mydata[mydata$SUB_CATEGORY == "KIDS CEREAL",]
sub4 <- mydata[mydata$SUB_CATEGORY == "MOUTHWASH/RINSES AND SPRAYS",]
sub5 <- mydata[mydata$SUB_CATEGORY == "MOUTHWASHES (ANTISEPTIC)",]
sub6 <- mydata[mydata$SUB_CATEGORY == "PIZZA/PREMIUM",]
sub7 <- mydata[mydata$SUB_CATEGORY == "PRETZELS",]
## 把spend转化为ts形式
sub1_spend <- ts(sub1$V1, frequency = 52, start = c(2009,2))
sub2_spend <- ts(sub2$V1, frequency = 52, start = c(2009,2))
sub3_spend <- ts(sub3$V1, frequency = 52, start = c(2009,2))
sub4_spend <- ts(sub4$V1, frequency = 52, start = c(2009,2))
sub5_spend <- ts(sub5$V1, frequency = 52, start = c(2009,2))
sub6_spend <- ts(sub6$V1, frequency = 52, start = c(2009,2))
sub7_spend <- ts(sub7$V1, frequency = 52, start = c(2009,2))
## 把price转化为ts形式
sub1_price <- ts(sub1$V2, frequency = 52, start = c(2009,2))
sub2_price <- ts(sub2$V2, frequency = 52, start = c(2009,2))
sub3_price <- ts(sub3$V2, frequency = 52, start = c(2009,2))
sub4_price <- ts(sub4$V2, frequency = 52, start = c(2009,2))
sub5_price <- ts(sub5$V2, frequency = 52, start = c(2009,2))
sub6_price <- ts(sub6$V2, frequency = 52, start = c(2009,2))
sub7_price <- ts(sub7$V2, frequency = 52, start = c(2009,2))
## 计算任意两个sub_C之间的price gap
gap_12 <- sub2_price - sub1_price
gap_13 <- sub3_price - sub1_price
gap_14 <- sub4_price - sub1_price
gap_15 <- sub5_price - sub1_price
gap_16 <- sub6_price - sub1_price
gap_17 <- sub1_price - sub7_price
gap_23 <- sub2_price - sub3_price
gap_24 <- sub4_price - sub2_price
gap_25 <- sub5_price - sub2_price
gap_26 <- sub6_price - sub2_price
gap_27 <- sub2_price - sub7_price
gap_34 <- sub4_price - sub3_price
gap_35 <- sub5_price - sub3_price
gap_36 <- sub6_price - sub3_price
gap_37 <- sub3_price - sub7_price
gap_45 <- sub4_price - sub5_price
gap_46 <- sub6_price - sub4_price
gap_47 <- sub4_price - sub7_price
gap_56 <- sub6_price - sub5_price
gap_57 <- sub5_price - sub7_price
gap_67 <- sub6_price - sub7_price
temp_date <- sub1$WEEK_END_DATE


PriceGap <- function(spend1, spend2, pricegap) {
  ts_group <- cbind(spend1, spend2, pricegap)
  xts_group <- xts(ts_group, temp_date) %>% as.xts()
  title <- paste0(substitute(pricegap) %>% as.character()," between SUB CATEGORY")
  dygraph(xts_group, main = title) %>%
    dySeries(substitute(pricegap) %>% as.character(), axis = 'y2') %>%
    dyOptions(strokeWidth = 2) %>%
    dyRangeSelector()
}

通过分析price gap和spend的变化趋势,来初步判断任意两个sub categories之间是否有关联。为了避免多余的工作,首先凭借经验判断,CEREAL之间是有关系的,MOUTHWASH之间是有关的,

3种CEREAL之间,123分别是ADULT, ALL, KIDS

PriceGap(sub1_spend, sub2_spend, gap_12)
PriceGap(sub2_spend, sub3_spend, gap_23)
PriceGap(sub1_spend, sub3_spend, gap_13)

成人麦片&全家麦片:在PRICE GAP比较小的时候(成人比全家贵),全家麦片的销售额更大(猜想:成人小孩都能吃),GAP比较大时(售价全家>成人),成人和全家的麦片都销售额较低。

小孩麦片&全家麦片:当全家麦片的价格更高时(GAP峰值),小孩买票的销售额增高,全家麦片降低;反之全家麦片的销售额增高。

小孩&成人:小孩麦片比较贵时(GAP峰值),ADULT麦片的销量上升,KIDS下降;反之KIDS上升,ADULT下降

总结:三种麦片之间存在替代关系。

2种漱口水,45分别是漱口喷雾和防腐剂漱口水

PriceGap(sub4_spend, sub5_spend, gap_45)

含防腐剂的漱口水销售额整体比另一种漱口喷雾要低,理论上当喷雾比防腐剂漱口水的价格GAP更高时,应该出现防腐剂漱口水的销售额上升,但是图中并没有这样的规律,因此很可能漱口水不是顾客主动选择的,而是被随机放进一个basket里的附带物品(因此同时推测所有其他的食物和两种漱口水都是互补关系)。并且认为两种漱口水之间没有替代关系。

PIZZA和PRETZELS,分别对应67

PriceGap(sub6_spend, sub7_spend, gap_67)

PIZZA的价格上升时,出现PIZZA销售额下降,但是不伴随PRETZELS的销售额上升,说明二者不存在替代关系。

CEREAL和PRETZELS

PriceGap(sub1_spend, sub7_spend, gap_17)
PriceGap(sub2_spend, sub7_spend, gap_27)
PriceGap(sub3_spend, sub7_spend, gap_37)

成人麦片&PRETZELS:成人麦片便宜时,其自身销售额提高,但是PRETZELS没有明显的销售额下降,二者不存在替代关系;

全家麦片和PRETZELS存在替代关系;小孩麦片和PRETZELS没有替代关系。

CEREAL和PIZZA

PriceGap(sub1_spend, sub6_spend, gap_16)
PriceGap(sub2_spend, sub6_spend, gap_26)
PriceGap(sub3_spend, sub6_spend, gap_36)

成人麦片和PIZZA有替代关系;全家麦片和PIZZA没有替代关系;小孩麦片和PIZZA好像也没关系。。。

打折点的影响

PEREAL

从每个产品的角度来看,打折是一些偶然发生的事件;但是从一个subC的角度,几乎天天都有打折。所以每个sub C中随机选取一个店铺的一个产品来观察。

library(lubridate)
library(plotly)
product1 <- data1[data1$UPC == data1$UPC[1] & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product1, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
feature <- product1[which(product1$FEATURE != 0),]
feature_time <- feature$WEEK_END_DATE
text <- rep("FEA", length(feature_time))
text <- paste0(text,"_",1:length(feature_time))
add_trace(p, x = feature$WEEK_END_DATE, y = feature$SPEND, type = "scatter", mode = "markers", name = "FEATURE") %>%
  layout(annotations = list(x = feature$WEEK_END_DATE, y = feature$SPEND, text = text, showarrow = F))
display <- product1[which(product1$DISPLAY != 0),]
display_time <- display$WEEK_END_DATE
text <- rep("DIS", length(display_time))
text <- paste0(text,"_",1:length(display_time))
add_trace(p, x = display$WEEK_END_DATE, y = display$SPEND, type = "scatter", mode = "markers", name = "DISPLAY") %>%
  layout(annotations = list(x = display$WEEK_END_DATE, y = display$SPEND, text = text, showarrow = F))

DISPLAY有影响,和销售额的峰值匹配

tpr <- product1[which(product1$TPR_ONLY != 0),]
tpr_time <- tpr$WEEK_END_DATE
text <- rep("TPR", length(tpr_time))
text <- paste0(text,"_",1:length(tpr_time))
add_trace(p, x = tpr$WEEK_END_DATE, y = tpr$SPEND, type = "scatter", mode = "markers", name = "TPR_ONLY") %>%
  layout(annotations = list(x = tpr$WEEK_END_DATE, y = tpr$SPEND, text = text, showarrow = F))

TPR_ONLY的促销方式在SPEND值的变化上没有明显的规律。

ADULT CEREAL

product2 <- data1[data1$UPC == 88491201426 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product2, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
## 标点函数
PointFeature <- function(data) {
  feature <- data[which(data$FEATURE != 0),]
  feature_time <- feature$WEEK_END_DATE
  text <- rep("FEA", length(feature_time))
  text <- paste0(text,"_",1:length(feature_time))
  add_trace(p, x = feature$WEEK_END_DATE, y = feature$SPEND, type = "scatter", mode = "markers", name = "FEATURE") %>% 
    layout(annotations = list(x = feature$WEEK_END_DATE, y = feature$SPEND, text = text, showarrow = F))
}
PointDisplay <- function(data) {
  display <- data[which(data$DISPLAY != 0),]
  display_time <- display$WEEK_END_DATE
  text <- rep("DIS", length(display_time))
  text <- paste0(text,"_",1:length(display_time))
  add_trace(p, x = display$WEEK_END_DATE, y = display$SPEND, type = "scatter", mode = "markers", name = "DISPLAY") %>%
    layout(annotations = list(x = display$WEEK_END_DATE, y = display$SPEND, text = text, showarrow = F))
}
PointTPR <- function(data) {
  tpr <- data[which(data$TPR_ONLY != 0),]
  tpr_time <- tpr$WEEK_END_DATE
  text <- rep("TPR", length(tpr_time))
  text <- paste0(text,"_",1:length(tpr_time))
  add_trace(p, x = tpr$WEEK_END_DATE, y = tpr$SPEND, type = "scatter", mode = "markers", name = "TPR_ONLY") %>%
    layout(annotations = list(x = tpr$WEEK_END_DATE, y = tpr$SPEND, text = text, showarrow = F))
}
PointFeature(product2)

DISPLAY有影响,和销售额的峰值匹配

PointDisplay(product2)
PointTPR(product2)

似乎三种打折方式都会对销售有影响。

ALL FAMILY CEREAL

product3 <- data1[data1$UPC == 3000006340 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product3, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
PointFeature(product3)
PointDisplay(product3)
PointTPR(product3)

TPR还是没什么规律。

KIDS CEREAL

product4 <- data1[data1$UPC == 3800039118 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product4, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
PointFeature(product4)
PointDisplay(product4)
PointTPR(product4)

TPR出现在销售的低谷(考虑TPR的原因),因此不对销售产生明显的影响。

MOUTHWASH/RINSES

product5 <- data1[data1$UPC == 3700031613 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product5, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
PointFeature(product5)
# PointDisplay(product5)
PointTPR(product5)

没有DISPLAY的记录,FEATURE出现在峰值,TPR混乱无影响。

MOUTHWASH(ANTISEPTIC)

product6 <- data1[data1$UPC == 1111038080 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product6, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
PointFeature(product6)
# PointDisplay(product6)
PointTPR(product6)

没有DISPLAY的记录,FEATURE出现在峰值,TPR混乱无影响。

PIZZA

product7 <- data1[data1$UPC == 1111087395 & data1$STORE_NUM == data1$STORE_NUM[1],]
p <- plot_ly(product7, x = ~WEEK_END_DATE, y = ~SPEND, type = "scatter", mode = "lines", name = "销售额") %>%
  layout(title = "", showticklables = T, tickfont = list(size = 8))
p
PointFeature(product7)
PointDisplay(product7)
PointTPR(product7)

FEATURE和DISPLAY出现在峰值,TPR在谷底,前两个是对销售产生影响的。